Exercise 2: Correlation is invariant under positive linear transformation (16 points)

Exercise 3: Plotting bars for the WHO data

Read the data into R (2 points)

# Location of the course data sets; the WHO table is the file "WHO.csv" there.
url_prefix <- "https://raw.githubusercontent.com/michael-franke/intro-data-analysis/master/data_sets/"
WHO_data_url <- paste0(url_prefix, "WHO.csv")

# Download and parse the CSV, then print a compact overview of every column.
d <- WHO_data_url %>% read_csv()
d %>% glimpse()
## Observations: 194
## Variables: 13
## $ Country                       <chr> "Afghanistan", "Albania", "Algeria…
## $ Region                        <chr> "Eastern Mediterranean", "Europe",…
## $ Population                    <dbl> 29825, 3162, 38482, 78, 20821, 89,…
## $ Under15                       <dbl> 47.42, 21.33, 27.42, 15.20, 47.58,…
## $ Over60                        <dbl> 3.82, 14.93, 7.17, 22.86, 3.84, 12…
## $ FertilityRate                 <chr> "\r5.4\r", "\r1.75\r", "\r2.83\r",…
## $ LifeExpectancy                <dbl> 60, 74, 73, 82, 51, 75, 76, 71, 82…
## $ ChildMortality                <dbl> 98.5, 16.7, 20.0, 3.2, 163.5, 9.9,…
## $ CellularSubscribers           <dbl> 54.26, 96.39, 98.99, 75.49, 48.38,…
## $ LiteracyRate                  <chr> NA, NA, NA, NA, "\r70.1\r", "\r99.…
## $ GNI                           <chr> "\r1140\r", "\r8820\r", "\r8310\r"…
## $ PrimarySchoolEnrollmentMale   <chr> NA, NA, "\r98.2\r", "\r78.4\r", "\…
## $ PrimarySchoolEnrollmentFemale <chr> NA, NA, "\r96.4\r", "\r79.4\r", "\…

Make a bar plot with geom_bar (4 points)

# Bar plot of the number of rows per region. geom_bar() does the counting
# itself, so only the x aesthetic needs to be mapped.
ggplot(data = d, mapping = aes(x = Region)) +
  geom_bar() +
  labs(x = "Region", y = "Number of countries in data set")

Make a bar plot with geom_col (4 points)

# Count the rows of each region up front and draw the counts with geom_col().
# Bars are ordered from the region with the most entries to the one with the
# fewest (fct_reorder with .desc = TRUE).
countries_per_region <- d %>%
  count(Region, name = "n_countries") %>%
  ggplot(
    aes(
      x = fct_reorder(Region, n_countries, .desc = TRUE),
      y = n_countries
    )
  ) +
  geom_col() +
  labs(
    title = "Countries per region",
    x = "Region",
    y = "Number of countries in data set"
  )

# Display the stored plot.
countries_per_region

Plotting population per region (4 points)

# Sum the Population column within each region and show the totals as columns.
population_per_region <- d %>%
  group_by(Region) %>%
  summarise(region_population = sum(Population)) %>%
  ggplot(aes(x = Region, y = region_population)) +
  geom_col() +
  labs(title = "Population per region", x = "Region", y = "Population")

# Display the stored plot.
population_per_region

Combining plots (4 points)

# Arrange the two stored bar plots in a grid with two rows and one column.
plot_grid(
  countries_per_region,
  population_per_region,
  nrow = 2,
  ncol = 1
)

Exercise 4: Violin plots for the WHO data

Create summary statistics (4 points)

# Per-region summary of ChildMortality: minimum, the three quartiles, the mean
# and the maximum. Column names are non-syntactic for the quantiles, hence the
# backticks.
d %>%
  group_by(Region) %>%
  summarise(
    Min = min(ChildMortality),
    `0.25_quant` = quantile(ChildMortality, probs = 0.25),
    `0.5_quant` = quantile(ChildMortality, probs = 0.5),
    mean = mean(ChildMortality),
    `0.75_quant` = quantile(ChildMortality, probs = 0.75),
    Max = max(ChildMortality)
  )
## # A tibble: 6 x 7
##   Region              Min `0.25_quant` `0.5_quant`  mean `0.75_quant`   Max
##   <chr>             <dbl>        <dbl>       <dbl> <dbl>        <dbl> <dbl>
## 1 Africa             13.1        58.6         81.8  84.0        102.  182. 
## 2 Americas            5.3        13.0         17.5  19.3         22.4  75.6
## 3 Eastern Mediterr…   7.4        11.2         18.4  40.2         69.8 147. 
## 4 Europe              2.2         3.8          4.8  10.1         10.7  58.3
## 5 South-East Asia     9.6        21           40.9  35.0         48.4  56.7
## 6 Western Pacific     2.9         9.55        22.4  24.7         34.1  71.8

Violin plots for group comparisons (of means) (6 points)

# Attach each region's mean ChildMortality to every row of that region, so the
# regions can be ordered by that mean on the x axis. The grouping is dropped
# again afterwards.
WHO_data <- d %>%
  group_by(Region) %>%
  mutate(mean_cm = mean(ChildMortality)) %>%
  ungroup()

# One violin per region, regions sorted by their mean child mortality.
ggplot(
  data = WHO_data,
  mapping = aes(x = fct_reorder(Region, mean_cm), y = ChildMortality)
) +
  geom_violin() +
  labs(x = "Region")

Adding means and confidence intervals to the violin plot (6 points)

# Per-region interval estimates for ChildMortality from bootstrapped_CI().
# The helper is defined outside this section (presumably a bootstrapped
# confidence interval of the mean -- confirm against its definition); the plot
# below reads the columns `lower`, `mean` and `upper` from its result.
# Each region's result is stored in a list column and then unnested into
# regular columns next to Region. No extra per-row mean is computed here,
# because only the ChildMortality values are handed to the helper.
ci_means_cm <- d %>%
  group_by(Region) %>%
  summarise(CIs = list(bootstrapped_CI(ChildMortality))) %>%
  unnest(CIs)

# Violins per region (sorted by mean_cm, coloured by region, no legend) with
# the point estimate and its interval from ci_means_cm drawn on top.
WHO_data %>%
  ggplot() +
  geom_violin(
    mapping = aes(
      x = fct_reorder(Region, mean_cm),
      y = ChildMortality,
      fill = Region
    ),
    show.legend = FALSE
  ) +
  geom_pointrange(
    mapping = aes(x = Region, y = mean, ymin = lower, ymax = upper),
    data = ci_means_cm
  ) +
  labs(x = "Region")